"""Labelled Evaluation Class."""

import asyncio
import time
from typing import Any, Dict, List, Sequence, Optional

from llama_index.core.bridge.pydantic import Field
from llama_index.core.evaluation import (
    BaseEvaluator,
    EvaluationResult,
)
from llama_index.core.evaluation.pairwise import EvaluationSource
from llama_index.core.llama_dataset.base import (
    BaseLlamaDataExample,
    BaseLlamaDataset,
    BaseLlamaExamplePrediction,
    BaseLlamaPredictionDataset,
    CreatedBy,
)


class EvaluatorExamplePrediction(BaseLlamaExamplePrediction):
    """
    Evaluation example prediction class.

    A "prediction" here is the judgement made by an evaluator, i.e. its
    feedback and score, together with a flag marking predictions that are
    not valid.

    Args:
        feedback (str): The evaluator's feedback. Defaults to an empty string.
        score (Optional[float]): The evaluator's score.
        invalid_prediction (bool): Whether or not the prediction is a valid one.
        invalid_reason (Optional[str]): Reason as to why prediction is invalid.

    """

    # NOTE: the descriptions of `feedback` and `score` used to be copies of
    # the description of a generated response; they now describe the
    # evaluator outputs these fields actually hold.
    feedback: str = Field(
        default_factory=str,
        description="The feedback generated by the evaluator.",
    )
    score: Optional[float] = Field(
        default=None,
        description="The score generated by the evaluator.",
    )
    invalid_prediction: bool = Field(
        default=False, description="Whether or not the prediction is a valid one."
    )
    invalid_reason: Optional[str] = Field(
        default=None, description="Reason as to why prediction is invalid."
    )

    @property
    def class_name(self) -> str:
        """Prediction class name."""
        return "EvaluatorExamplePrediction"


class LabelledEvaluatorDataExample(BaseLlamaDataExample):
    """
    Evaluation example class.

    This data class contains the ingredients to perform a new "prediction" i.e.,
    evaluation. Here an evaluator is meant to evaluate a response against an
    associated query as well as optionally contexts. It also carries the
    reference (ground-truth) evaluation of that response.

    Args:
        query (str): The user query.
        query_by (Optional[CreatedBy]): What generated the query, human or ai (model-name).
        contexts (Optional[List[str]]): The contexts used to generate the answer.
        answer (str): Answer to the query that is to be evaluated.
        answer_by (Optional[CreatedBy]): What generated the answer, human or ai (model-name).
        ground_truth_answer (Optional[str]): The ground-truth answer that is
            used to evaluate the provided `answer`.
        ground_truth_answer_by (Optional[CreatedBy]): What generated the
            ground-truth answer.
        reference_feedback (Optional[str]): The reference feedback evaluation.
        reference_score (float): The reference score evaluation. Defaults to 0.0.
        reference_evaluation_by (Optional[CreatedBy]): What generated the
            reference evaluation (feedback and score), human or ai (model-name).

    """

    query: str = Field(
        default_factory=str, description="The user query for the example."
    )
    query_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the query."
    )
    contexts: Optional[List[str]] = Field(
        default=None,
        description="The contexts used to generate the answer.",
    )
    answer: str = Field(
        default_factory=str,
        description="The provided answer to the example that is to be evaluated.",
    )
    answer_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the answer."
    )
    ground_truth_answer: Optional[str] = Field(
        default=None,
        description="The ground truth answer to the example that is used to evaluate the provided `answer`.",
    )
    ground_truth_answer_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the ground-truth answer."
    )
    reference_feedback: Optional[str] = Field(
        default=None,
        description="The reference feedback (ground-truth).",
    )
    reference_score: float = Field(
        default_factory=float, description="The reference score (ground-truth)."
    )
    reference_evaluation_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the evaluation (feedback and score)."
    )

    @property
    def class_name(self) -> str:
        """Data example class name."""
        return "LabelledEvaluatorDataExample"


class EvaluatorPredictionDataset(BaseLlamaPredictionDataset):
    """Dataset holding the predictions (feedback and score) of an evaluator."""

    _prediction_type = EvaluatorExamplePrediction

    def to_pandas(self) -> Any:
        """
        Build a pandas DataFrame with one row per prediction.

        Returns:
            A DataFrame with the columns `feedback` and `score`.

        Raises:
            ImportError: If pandas is not installed.
            ValueError: If a prediction is not an `EvaluatorExamplePrediction`.

        """
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "pandas is required for this function. Please install it with `pip install pandas`."
            )

        # Collect the predictions, rejecting any foreign prediction type.
        rows: List[EvaluatorExamplePrediction] = []
        for item in self.predictions:
            if isinstance(item, EvaluatorExamplePrediction):
                rows.append(item)
                continue
            raise ValueError(
                "EvaluatorPredictionDataset can only contain EvaluatorExamplePrediction instances."
            )

        columns: Dict[str, List] = {
            "feedback": [row.feedback for row in rows],
            "score": [row.score for row in rows],
        }
        return pd.DataFrame(columns)

    @property
    def class_name(self) -> str:
        """Name of this dataset class."""
        return "EvaluatorPredictionDataset"


class LabelledEvaluatorDataset(BaseLlamaDataset[BaseEvaluator]):
    """Dataset of labelled evaluation examples on which an evaluator can be run."""

    _example_type = LabelledEvaluatorDataExample

    def to_pandas(self) -> Any:
        """
        Build a pandas DataFrame with one row per example.

        The `*_by` columns hold the string form of the corresponding values.

        Returns:
            A DataFrame with one column per example field.

        Raises:
            ImportError: If pandas is not installed.
            ValueError: If an example is not a `LabelledEvaluatorDataExample`.

        """
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "pandas is required for this function. Please install it with `pip install pandas`."
            )

        # Collect the examples, rejecting any foreign example type.
        rows: List[LabelledEvaluatorDataExample] = []
        for item in self.examples:
            if isinstance(item, LabelledEvaluatorDataExample):
                rows.append(item)
                continue
            raise ValueError(
                "LabelledEvaluatorDataset can only contain LabelledEvaluatorDataExample instances."
            )

        columns: Dict[str, List] = {
            "query": [row.query for row in rows],
            "answer": [row.answer for row in rows],
            "contexts": [row.contexts for row in rows],
            "ground_truth_answer": [row.ground_truth_answer for row in rows],
            "query_by": [str(row.query_by) for row in rows],
            "answer_by": [str(row.answer_by) for row in rows],
            "ground_truth_answer_by": [
                str(row.ground_truth_answer_by) for row in rows
            ],
            "reference_feedback": [row.reference_feedback for row in rows],
            "reference_score": [row.reference_score for row in rows],
            "reference_evaluation_by": [
                str(row.reference_evaluation_by) for row in rows
            ],
        }
        return pd.DataFrame(columns)

    def _to_example_prediction(
        self, outcome: EvaluationResult
    ) -> EvaluatorExamplePrediction:
        """Turn an evaluation result into a prediction, keeping invalid results flagged."""
        if outcome.invalid_result:
            return EvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=outcome.invalid_reason
            )
        return EvaluatorExamplePrediction(
            feedback=outcome.feedback or "", score=outcome.score
        )

    async def _apredict_example(  # type: ignore
        self,
        predictor: BaseEvaluator,
        example: LabelledEvaluatorDataExample,
        sleep_time_in_seconds: int,
    ) -> EvaluatorExamplePrediction:
        """
        Asynchronously evaluate one example with the given evaluator.

        Sleeps for `sleep_time_in_seconds` before calling the evaluator; the
        same value is also forwarded to the evaluator. Any exception raised
        by the evaluation is turned into an invalid prediction.
        """
        await asyncio.sleep(sleep_time_in_seconds)
        try:
            outcome: EvaluationResult = await predictor.aevaluate(
                query=example.query,
                response=example.answer,
                contexts=example.contexts,
                reference=example.ground_truth_answer,
                sleep_time_in_seconds=sleep_time_in_seconds,
            )
        except Exception as err:
            # TODO: raise warning here as well
            failure_reason = f"Caught error {err!s}"
            return EvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=failure_reason
            )

        return self._to_example_prediction(outcome)

    def _predict_example(  # type: ignore
        self,
        predictor: BaseEvaluator,
        example: LabelledEvaluatorDataExample,
        sleep_time_in_seconds: int = 0,
    ) -> EvaluatorExamplePrediction:
        """
        Evaluate one example with the given evaluator.

        Sleeps for `sleep_time_in_seconds` before calling the evaluator; the
        same value is also forwarded to the evaluator. Any exception raised
        by the evaluation is turned into an invalid prediction.
        """
        time.sleep(sleep_time_in_seconds)
        try:
            outcome: EvaluationResult = predictor.evaluate(
                query=example.query,
                response=example.answer,
                contexts=example.contexts,
                reference=example.ground_truth_answer,
                sleep_time_in_seconds=sleep_time_in_seconds,
            )
        except Exception as err:
            # TODO: raise warning here as well
            failure_reason = f"Caught error {err!s}"
            return EvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=failure_reason
            )

        return self._to_example_prediction(outcome)

    def _construct_prediction_dataset(  # type: ignore
        self, predictions: Sequence[EvaluatorExamplePrediction]
    ) -> EvaluatorPredictionDataset:
        """Wrap the given predictions in an `EvaluatorPredictionDataset`."""
        return EvaluatorPredictionDataset(predictions=predictions)

    @property
    def class_name(self) -> str:
        """Name of this dataset class."""
        return "LabelledEvaluatorDataset"


class PairwiseEvaluatorExamplePrediction(BaseLlamaExamplePrediction):
    """
    Pairwise evaluation example prediction class.

    A "prediction" here is the judgement made by a pairwise evaluator, i.e.
    its feedback, score and the ordering the judgement came from, together
    with a flag marking predictions that are not valid.

    Args:
        feedback (str): The evaluator's feedback. Defaults to an empty string.
        score (Optional[float]): The evaluator's score.
        evaluation_source (Optional[EvaluationSource]): If the evaluation came
            from original order or flipped; or inconclusive.
        invalid_prediction (bool): Whether or not the prediction is a valid one.
        invalid_reason (Optional[str]): Reason as to why prediction is invalid.

    """

    # NOTE: the descriptions of `feedback` and `score` used to be copies of
    # the description of a generated response; they now describe the
    # evaluator outputs these fields actually hold.
    feedback: str = Field(
        default_factory=str,
        description="The feedback generated by the pairwise evaluator.",
    )
    score: Optional[float] = Field(
        default=None,
        description="The score generated by the pairwise evaluator.",
    )
    evaluation_source: Optional[EvaluationSource] = Field(
        default=None,
        description=(
            "Whether the evaluation comes from original, or flipped ordering. Can also be neither here indicating inconclusive judgement."
        ),
    )
    invalid_prediction: bool = Field(
        default=False, description="Whether or not the prediction is a valid one."
    )
    invalid_reason: Optional[str] = Field(
        default=None, description="Reason as to why prediction is invalid."
    )

    @property
    def class_name(self) -> str:
        """Prediction class name."""
        return "PairwiseEvaluatorExamplePrediction"


class PairwiseEvaluatorPredictionDataset(BaseLlamaPredictionDataset):
    """Dataset holding the predictions of a pairwise evaluator."""

    _prediction_type = PairwiseEvaluatorExamplePrediction

    def to_pandas(self) -> Any:
        """
        Build a pandas DataFrame with one row per prediction.

        Returns:
            A DataFrame with the columns `feedback`, `score` and `ordering`,
            where `ordering` is the string form of `evaluation_source`.

        Raises:
            ImportError: If pandas is not installed.
            ValueError: If a prediction is not a
                `PairwiseEvaluatorExamplePrediction`.

        """
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "pandas is required for this function. Please install it with `pip install pandas`."
            )

        # Collect the predictions, rejecting any foreign prediction type.
        rows: List[PairwiseEvaluatorExamplePrediction] = []
        for item in self.predictions:
            if isinstance(item, PairwiseEvaluatorExamplePrediction):
                rows.append(item)
                continue
            raise ValueError(
                "PairwiseEvaluatorPredictionDataset can only contain PairwiseEvaluatorExamplePrediction instances."
            )

        columns: Dict[str, List] = {
            "feedback": [row.feedback for row in rows],
            "score": [row.score for row in rows],
            "ordering": [str(row.evaluation_source) for row in rows],
        }
        return pd.DataFrame(columns)

    @property
    def class_name(self) -> str:
        """Name of this dataset class."""
        return "PairwiseEvaluatorPredictionDataset"


class LabelledPairwiseEvaluatorDataExample(LabelledEvaluatorDataExample):
    """
    Labelled pairwise evaluation data example class.

    Extends `LabelledEvaluatorDataExample` with a second answer, so that an
    example carries two answers (`answer` and `second_answer`).

    Args:
        second_answer (str): The second answer, to be evaluated versus `answer`.
        second_answer_by (Optional[CreatedBy]): What generated the second answer.

    """

    second_answer: str = Field(
        default_factory=str,
        description="The second answer to the example that is to be evaluated along versus `answer`.",
    )
    second_answer_by: Optional[CreatedBy] = Field(
        default=None, description="What generated the second answer."
    )

    @property
    def class_name(self) -> str:
        """Data example class name."""
        return "LabelledPairwiseEvaluatorDataExample"


class LabelledPairwiseEvaluatorDataset(BaseLlamaDataset[BaseEvaluator]):
    """
    Labelled pairwise evaluation dataset. For evaluating the evaluator in
    performing pairwise evaluations.

    Each example carries two answers (`answer` and `second_answer`) that are
    handed to the evaluator as `response` and `second_response`.

    """

    _example_type = LabelledPairwiseEvaluatorDataExample

    def to_pandas(self) -> Any:
        """
        Create pandas dataframe.

        Returns:
            A DataFrame with one row per example and one column per example
            field; the `*_by` columns hold the string form of their values.

        Raises:
            ImportError: If pandas is not installed.
            ValueError: If an example is not a
                `LabelledPairwiseEvaluatorDataExample`.

        """
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "pandas is required for this function. Please install it with `pip install pandas`."
            )

        data: Dict[str, List] = {
            "query": [],
            "answer": [],
            "second_answer": [],
            "contexts": [],
            "ground_truth_answer": [],
            "query_by": [],
            "answer_by": [],
            "second_answer_by": [],
            "ground_truth_answer_by": [],
            "reference_feedback": [],
            "reference_score": [],
            "reference_evaluation_by": [],
        }
        for example in self.examples:
            if not isinstance(example, LabelledPairwiseEvaluatorDataExample):
                raise ValueError(
                    "LabelledPairwiseEvaluatorDataset can only contain LabelledPairwiseEvaluatorDataExample instances."
                )
            data["query"].append(example.query)
            data["answer"].append(example.answer)
            data["second_answer"].append(example.second_answer)
            data["contexts"].append(example.contexts)
            data["ground_truth_answer"].append(example.ground_truth_answer)
            data["query_by"].append(str(example.query_by))
            data["answer_by"].append(str(example.answer_by))
            data["second_answer_by"].append(str(example.second_answer_by))
            data["ground_truth_answer_by"].append(str(example.ground_truth_answer_by))
            data["reference_feedback"].append(example.reference_feedback)
            data["reference_score"].append(example.reference_score)
            data["reference_evaluation_by"].append(str(example.reference_evaluation_by))

        return pd.DataFrame(data)

    def _to_pairwise_prediction(
        self, eval_result: EvaluationResult
    ) -> PairwiseEvaluatorExamplePrediction:
        """
        Convert an evaluation result into a pairwise prediction.

        Shared by the sync and async prediction paths so both treat results
        identically. Never raises on a missing or unrecognised
        `pairwise_source`: a missing source yields `evaluation_source=None`,
        an unrecognised one yields an invalid prediction.
        """
        if eval_result.invalid_result:
            return PairwiseEvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=eval_result.invalid_reason
            )

        evaluation_source: Optional[EvaluationSource] = None
        raw_source = eval_result.pairwise_source
        if raw_source is not None:
            # `EvaluationSource(None)` would raise, so the conversion is only
            # attempted when the evaluator actually reported a source.
            try:
                evaluation_source = EvaluationSource(raw_source)
            except ValueError as err:
                # Report an unrecognised source the same way evaluator errors
                # are reported, rather than aborting the prediction run.
                return PairwiseEvaluatorExamplePrediction(
                    invalid_prediction=True, invalid_reason=f"Caught error {err!s}"
                )

        return PairwiseEvaluatorExamplePrediction(
            feedback=eval_result.feedback or "",
            score=eval_result.score,
            evaluation_source=evaluation_source,
        )

    async def _apredict_example(  # type: ignore
        self,
        predictor: BaseEvaluator,
        example: LabelledPairwiseEvaluatorDataExample,
        sleep_time_in_seconds: int,
    ) -> PairwiseEvaluatorExamplePrediction:
        """
        Async predict evaluation example with an Evaluator.

        Sleeps for `sleep_time_in_seconds` before calling the evaluator; the
        same value is also forwarded to the evaluator. Any exception raised
        by the evaluation is turned into an invalid prediction.
        """
        await asyncio.sleep(sleep_time_in_seconds)
        try:
            eval_result: EvaluationResult = await predictor.aevaluate(
                query=example.query,
                response=example.answer,
                second_response=example.second_answer,
                contexts=example.contexts,
                reference=example.ground_truth_answer,
                sleep_time_in_seconds=sleep_time_in_seconds,
            )
        except Exception as err:
            # TODO: raise warning here as well
            return PairwiseEvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=f"Caught error {err!s}"
            )

        return self._to_pairwise_prediction(eval_result)

    def _predict_example(  # type: ignore
        self,
        predictor: BaseEvaluator,
        example: LabelledPairwiseEvaluatorDataExample,
        sleep_time_in_seconds: int = 0,
    ) -> PairwiseEvaluatorExamplePrediction:
        """
        Predict evaluation example with an Evaluator.

        Sleeps for `sleep_time_in_seconds` before calling the evaluator; the
        same value is also forwarded to the evaluator. Any exception raised
        by the evaluation is turned into an invalid prediction.
        """
        time.sleep(sleep_time_in_seconds)
        try:
            eval_result: EvaluationResult = predictor.evaluate(
                query=example.query,
                response=example.answer,
                second_response=example.second_answer,
                contexts=example.contexts,
                reference=example.ground_truth_answer,
                sleep_time_in_seconds=sleep_time_in_seconds,
            )
        except Exception as err:
            # TODO: raise warning here as well
            return PairwiseEvaluatorExamplePrediction(
                invalid_prediction=True, invalid_reason=f"Caught error {err!s}"
            )

        return self._to_pairwise_prediction(eval_result)

    def _construct_prediction_dataset(  # type: ignore
        self, predictions: Sequence[PairwiseEvaluatorExamplePrediction]
    ) -> PairwiseEvaluatorPredictionDataset:
        """Construct prediction dataset."""
        return PairwiseEvaluatorPredictionDataset(predictions=predictions)

    @property
    def class_name(self) -> str:
        """Class name."""
        return "LabelledPairwiseEvaluatorDataset"


# British English + American English: the "Labeled" (single "l") names are
# aliases bound to the very same "Labelled" classes, so either spelling can
# be used interchangeably.
LabeledEvaluatorDataExample = LabelledEvaluatorDataExample
LabeledEvaluatorDataset = LabelledEvaluatorDataset
LabeledPairwiseEvaluatorDataExample = LabelledPairwiseEvaluatorDataExample
LabeledPairwiseEvaluatorDataset = LabelledPairwiseEvaluatorDataset
